#loading packages
library(ezids)
library(ggplot2)
library(ggrepel)
library(gridExtra)
library(tibble)
library(dplyr)
library(tidyr)
library(psych)
library(corrplot)
library(lattice)
library(FNN)
library(gmodels)
library(caret)
# Load the NYC Central Park daily weather observations (1869-2022).
# read.csv() already returns a data.frame, so no extra wrapper is needed.
NYweath <- read.csv("data/NYC_weather_1869_2022.csv")
# Convert DATE (ISO "YYYY-MM-DD") to R's Date class and derive calendar fields.
NYweath$DATE <- as.Date(NYweath$DATE)
NYweath$day <- format(NYweath$DATE, format = "%d")
NYweath$month <- format(NYweath$DATE, format = "%m")
NYweath$year <- format(NYweath$DATE, format = "%Y")
# Temperature observations arrive as text; make them numeric.
NYweath$TMAX <- as.numeric(NYweath$TMAX)
NYweath$TMIN <- as.numeric(NYweath$TMIN)
NYweath$TAVG <- as.numeric(NYweath$TAVG)
NYweath$year <- as.numeric(NYweath$year)
# month stays categorical so the models fit a separate level per month.
NYweath$month <- as.factor(NYweath$month)
# Subset to the variables used in this analysis.
NYweath_sub <- subset(NYweath, select = c(DATE, day, month, year, TMAX, TMIN, TAVG, PRCP, SNOW))
# Subset to 1900 onward.
NYweath_00 <- subset(NYweath_sub, year > 1899)
xkabledplyhead(NYweath_00)
| | DATE | day | month | year | TMAX | TMIN | TAVG | PRCP | SNOW |
|---|---|---|---|---|---|---|---|---|---|
| 11265 | 1900-01-01 | 01 | 01 | 1900 | 20 | 14 | NA | 0.03 | 0.5 |
| 11266 | 1900-01-02 | 02 | 01 | 1900 | 23 | 13 | NA | 0.00 | 0.0 |
| 11267 | 1900-01-03 | 03 | 01 | 1900 | 26 | 19 | NA | 0.00 | 0.0 |
| 11268 | 1900-01-04 | 04 | 01 | 1900 | 32 | 19 | NA | 0.00 | 0.0 |
| 11269 | 1900-01-05 | 05 | 01 | 1900 | 39 | 29 | NA | 0.00 | 0.0 |
# New data from Emily: New York State population by year, 1900-2021.
# read.csv() already returns a data.frame, so no extra wrapper is needed.
NYSPop_1900 <- read.csv("data/NYSPop_1900-2021.csv")
NYSPop_1900$Population <- as.numeric(NYSPop_1900$Population)
NYSPop_1900$Year <- as.numeric(NYSPop_1900$Year)
# Weather subset restricted to 1900-2021, the years with population data.
NYweath_00a <- subset(NYweath_sub, year > 1899 & year < 2022)
# Attach the state population matching each observation's year.
# match() is the vectorized equivalent of the original element-by-element
# which() loop and avoids growing the Pop column one entry at a time.
# A year absent from the population table yields NA here (the loop version
# errored on a zero-length assignment instead).
NYweath_00a$Pop <- NYSPop_1900$Population[match(NYweath_00a$year, NYSPop_1900$Year)]
# New data from Emily: daily NYC shooting counts.
NYshoot <- read.csv("data/Shooting_Counts_ERG.csv")
# Convert the character date ("m/d/Y") to R's Date class for joining.
NYshoot$DATE <- as.Date(NYshoot$Date, format = "%m/%d/%Y")
# Restrict the weather data to 2006-2021, the window covered by the
# shooting data, then merge the two on DATE. full_join keeps weather days
# with no shooting record (Shootings becomes NA, handled below).
NYweathshoot_06 <- subset(NYweath_00a, year > 2005 & year < 2022)
#str(NYweathshoot_06)
#str(NYshoot)
NYweathshoot_06 <- full_join(NYshoot, NYweathshoot_06, by = "DATE")
# Re-derive day/month/year from DATE so rows that came only from the
# shooting side still get calendar fields.
NYweathshoot_06$day <- as.numeric(format(NYweathshoot_06$DATE, format = "%d"))
NYweathshoot_06$month <- as.factor(format(NYweathshoot_06$DATE, format = "%m"))
# NOTE: the original passed format = "%Y" to as.numeric(), where it is
# silently ignored; the spurious argument has been dropped.
NYweathshoot_06$year <- as.numeric(format(NYweathshoot_06$DATE, format = "%Y"))
# Days with no recorded shootings are NA after the join; treat them as 0.
NYweathshoot_06 <- NYweathshoot_06 %>% mutate(Shootings = ifelse(is.na(Shootings), 0, Shootings))
summary(NYweathshoot_06)
## Date Shootings Murders DATE
## Length:5844 Min. : 0.0 Min. : 0 Min. :2006-01-01
## Class :character 1st Qu.: 2.0 1st Qu.: 0 1st Qu.:2009-12-31
## Mode :character Median : 4.0 Median : 0 Median :2013-12-31
## Mean : 4.4 Mean : 1 Mean :2013-12-31
## 3rd Qu.: 6.0 3rd Qu.: 1 3rd Qu.:2017-12-31
## Max. :47.0 Max. :12 Max. :2021-12-31
## NA's :435
## day month year TMAX TMIN
## Min. : 1.0 01 : 496 Min. :2006 Min. : 13.0 Min. :-1.0
## 1st Qu.: 8.0 03 : 496 1st Qu.:2010 1st Qu.: 48.0 1st Qu.:36.0
## Median :16.0 05 : 496 Median :2014 Median : 65.0 Median :49.0
## Mean :15.7 07 : 496 Mean :2014 Mean : 63.3 Mean :49.1
## 3rd Qu.:23.0 08 : 496 3rd Qu.:2017 3rd Qu.: 79.0 3rd Qu.:64.0
## Max. :31.0 10 : 496 Max. :2021 Max. :104.0 Max. :84.0
## (Other):2868
## TAVG PRCP SNOW Pop
## Min. : NA Min. :0.00 Min. : 0.00 Min. :19104631
## 1st Qu.: NA 1st Qu.:0.00 1st Qu.: 0.00 1st Qu.:19376734
## Median : NA Median :0.00 Median : 0.00 Median :19574362
## Mean :NaN Mean :0.14 Mean : 0.09 Mean :19524848
## 3rd Qu.: NA 3rd Qu.:0.06 3rd Qu.: 0.00 3rd Qu.:19640651
## Max. : NA Max. :7.57 Max. :27.30 Max. :20154933
## NA's :5844
CW ADD: Adding a 'TOT_PRCP' column that sums the PRCP and SNOW precipitation values. This column will be used in Question 3.
# Total precipitation = rain (PRCP) plus snowfall (SNOW); used in Question 3.
NYweath_final <- transform(NYweath_00, TOT_PRCP = PRCP + SNOW)
In this project, we are digging into the relationship between human activity and weather in New York city. Our three driving questions are:
How do changes in NYC weather patterns correlate to changes in population and economic activity over the same time frame?
Are there statistically measurable changes in NYC air quality over time, and are they correlated to changes in daily maximum temperature observed in previous analysis?
How do changes in weather patterns correlate to other local human activity, such as crime, reported COVID cases, and stock market performance?
Emily will re-do linear regression looking at measures of local and global human activity as regressors rather than year. She might also look into variable transformations (i.e., linear models fit to polynomials of regressors) to see if the response is best fit as linear or polynomial.
At the end of our exploratory data analysis, we developed a linear model of maximum daily temperature over time, with year as a linear regressor. This revealed to us that there is a statistically significant increase in average maximum temperatures over time. However, we do not suspect that time is the cause– rather, it is something else that has changed over time that has caused the warming in New York. We wanted to explore correlations with other, more direct proxies for human activity.
Our original fit used year as a numerical regressor and month as a categorical regressor. The resulting fit has an r-squared value of 0.773 and a slope of 0.025 degrees Fahrenheit per year, with all fit parameters’ p-values well below 0.05. The different intercepts for each level of the categorical variable (the twelve months of the year) indicated that January is the coldest and July the hottest month in Central Park, with an average difference in maximum daily temperature of approximately 46 degrees Fahrenheit in any given year over this window.
# Fit TMAX against year (numeric trend) plus month (categorical seasonality).
maxTfit00_ym <- lm(TMAX ~ year + month, data = NYweath_00a)
# Keep the residuals around for later inspection.
res00_ym <- resid(maxTfit00_ym)
summary(maxTfit00_ym)
##
## Call:
## lm(formula = TMAX ~ year + month, data = NYweath_00a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.37 -5.88 -0.16 5.63 37.50
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -10.78315 2.34463 -4.6 4.3e-06 ***
## year 0.02527 0.00119 21.2 < 2e-16 ***
## month02 1.35817 0.20898 6.5 8.2e-11 ***
## month03 10.13406 0.20406 49.7 < 2e-16 ***
## month04 21.45601 0.20575 104.3 < 2e-16 ***
## month05 32.29032 0.20406 158.2 < 2e-16 ***
## month06 40.85601 0.20575 198.6 < 2e-16 ***
## month07 46.00370 0.20406 225.4 < 2e-16 ***
## month08 44.14146 0.20406 216.3 < 2e-16 ***
## month09 37.44016 0.20575 182.0 < 2e-16 ***
## month10 26.46801 0.20406 129.7 < 2e-16 ***
## month11 14.59563 0.20575 70.9 < 2e-16 ***
## month12 3.71576 0.20406 18.2 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.87 on 44547 degrees of freedom
## Multiple R-squared: 0.773, Adjusted R-squared: 0.773
## F-statistic: 1.26e+04 on 12 and 44547 DF, p-value: <2e-16
The two extremes and their linear models are plotted in the following figure.
# Plot TMAX over time for January and July only (the coldest and hottest
# months), together with fitted lines for each.
ggplot(NYweath_00a, aes(x = year, y = TMAX, color = month)) +
  geom_point() +
  # Only January and July get a color; all other month levels map to NA
  # and are not drawn.
  scale_color_manual(values = c("01" = "purple4",
                                "07" = "red"), na.value = NA) +
  # Hard-coded intercepts/slope taken from the year+month model above.
  geom_abline(aes(intercept = -11.05508, slope = 0.02539), col = "black", size = 1) +
  geom_abline(aes(intercept = 34.98295, slope = 0.02539), col = "black", size = 1) +
  # One labs() call replaces the original redundant labs()/xlab()/ylab()/
  # ggtitle() chain; the values kept are the ones that previously took
  # effect (the last call for each label).
  labs(x = "Year",
       y = "Maximum daily temperature",
       title = "Maximum Daily Temperature in Central Park")
Do other weather variables correlate to TMAX?
# Correlations among year, TMAX, and the precipitation variables.
NYweath_cor <- subset(NYweath_00a, select = c(year, TMAX, PRCP, SNOW))
str(NYweath_cor)
## 'data.frame': 44560 obs. of 4 variables:
## $ year: num 1900 1900 1900 1900 1900 1900 1900 1900 1900 1900 ...
## $ TMAX: num 20 23 26 32 39 40 43 40 33 41 ...
## $ PRCP: num 0.03 0 0 0 0 0 0 0.01 0 0 ...
## $ SNOW: num 0.5 0 0 0 0 0 0 0 0 0 ...
# Pairwise-complete observations: an NA in one column does not blank the
# whole correlation matrix.
weathcor <- cor(NYweath_cor, use = "pairwise.complete.obs")
corrplot::corrplot(weathcor)
# NOTE(review): the bare `cor` below prints the entire source of stats::cor
# into the rendered document; it looks accidental and could be removed.
cor
## function (x, y = NULL, use = "everything", method = c("pearson",
## "kendall", "spearman"))
## {
## na.method <- pmatch(use, c("all.obs", "complete.obs", "pairwise.complete.obs",
## "everything", "na.or.complete"))
## if (is.na(na.method))
## stop("invalid 'use' argument")
## method <- match.arg(method)
## if (is.data.frame(y))
## y <- as.matrix(y)
## if (is.data.frame(x))
## x <- as.matrix(x)
## if (!is.matrix(x) && is.null(y))
## stop("supply both 'x' and 'y' or a matrix-like 'x'")
## if (!(is.numeric(x) || is.logical(x)))
## stop("'x' must be numeric")
## stopifnot(is.atomic(x))
## if (!is.null(y)) {
## if (!(is.numeric(y) || is.logical(y)))
## stop("'y' must be numeric")
## stopifnot(is.atomic(y))
## }
## Rank <- function(u) {
## if (length(u) == 0L)
## u
## else if (is.matrix(u)) {
## if (nrow(u) > 1L)
## apply(u, 2L, rank, na.last = "keep")
## else row(u)
## }
## else rank(u, na.last = "keep")
## }
## if (method == "pearson")
## .Call(C_cor, x, y, na.method, FALSE)
## else if (na.method %in% c(2L, 5L)) {
## if (is.null(y)) {
## .Call(C_cor, Rank(na.omit(x)), NULL, na.method, method ==
## "kendall")
## }
## else {
## nas <- attr(na.omit(cbind(x, y)), "na.action")
## dropNA <- function(x, nas) {
## if (length(nas)) {
## if (is.matrix(x))
## x[-nas, , drop = FALSE]
## else x[-nas]
## }
## else x
## }
## .Call(C_cor, Rank(dropNA(x, nas)), Rank(dropNA(y,
## nas)), na.method, method == "kendall")
## }
## }
## else if (na.method != 3L) {
## x <- Rank(x)
## if (!is.null(y))
## y <- Rank(y)
## .Call(C_cor, x, y, na.method, method == "kendall")
## }
## else {
## if (is.null(y)) {
## ncy <- ncx <- ncol(x)
## if (ncx == 0)
## stop("'x' is empty")
## r <- matrix(0, nrow = ncx, ncol = ncy)
## for (i in seq_len(ncx)) {
## for (j in seq_len(i)) {
## x2 <- x[, i]
## y2 <- x[, j]
## ok <- complete.cases(x2, y2)
## x2 <- rank(x2[ok])
## y2 <- rank(y2[ok])
## r[i, j] <- if (any(ok))
## .Call(C_cor, x2, y2, 1L, method == "kendall")
## else NA
## }
## }
## r <- r + t(r) - diag(diag(r))
## rownames(r) <- colnames(x)
## colnames(r) <- colnames(x)
## r
## }
## else {
## if (length(x) == 0L || length(y) == 0L)
## stop("both 'x' and 'y' must be non-empty")
## matrix_result <- is.matrix(x) || is.matrix(y)
## if (!is.matrix(x))
## x <- matrix(x, ncol = 1L)
## if (!is.matrix(y))
## y <- matrix(y, ncol = 1L)
## ncx <- ncol(x)
## ncy <- ncol(y)
## r <- matrix(0, nrow = ncx, ncol = ncy)
## for (i in seq_len(ncx)) {
## for (j in seq_len(ncy)) {
## x2 <- x[, i]
## y2 <- y[, j]
## ok <- complete.cases(x2, y2)
## x2 <- rank(x2[ok])
## y2 <- rank(y2[ok])
## r[i, j] <- if (any(ok))
## .Call(C_cor, x2, y2, 1L, method == "kendall")
## else NA
## }
## }
## rownames(r) <- colnames(x)
## colnames(r) <- colnames(y)
## if (matrix_result)
## r
## else drop(r)
## }
## }
## }
## <bytecode: 0x7fb7c5bdc9b0>
## <environment: namespace:stats>
We have found a reasonable linear model for temperature over time, but can we look instead at the connection to human activities, rather than time? Can we use some aspect of human activity as a regressor and generate a reasonable model?
We looked to the Census for U.S. population data, but that is only reported decennially, so we looked for other sources. We found historical data back to 1960 for New York state online https://www.macrotrends.net/cities/23083/new-york-city/population. Because this source is not known to us, we validated it against decennial census data.
A bunch of linear models…
#LM1: month-only baseline model for TMAX (seasonality, no year trend).
maxTfit00_m <- lm(formula = TMAX ~ month, data = NYweath_00a)
summary(maxTfit00_m)
##
## Call:
## lm(formula = TMAX ~ month, data = NYweath_00a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -40.47 -5.90 -0.19 5.78 37.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.754 0.145 267.24 <2e-16 ***
## month02 1.358 0.210 6.47 1e-10 ***
## month03 10.134 0.205 49.41 <2e-16 ***
## month04 21.456 0.207 103.76 <2e-16 ***
## month05 32.290 0.205 157.45 <2e-16 ***
## month06 40.856 0.207 197.58 <2e-16 ***
## month07 46.004 0.205 224.32 <2e-16 ***
## month08 44.141 0.205 215.24 <2e-16 ***
## month09 37.440 0.207 181.06 <2e-16 ***
## month10 26.468 0.205 129.06 <2e-16 ***
## month11 14.596 0.207 70.58 <2e-16 ***
## month12 3.716 0.205 18.12 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.92 on 44548 degrees of freedom
## Multiple R-squared: 0.771, Adjusted R-squared: 0.771
## F-statistic: 1.36e+04 on 11 and 44548 DF, p-value: <2e-16
#LM4: TMAX on year + month + precipitation, to test whether PRCP adds
# explanatory power beyond the year/month model.
maxTfit00_all <- lm(formula = TMAX ~ year + month + PRCP, data = NYweath_00a)
summary(maxTfit00_all)
##
## Call:
## lm(formula = TMAX ~ year + month + PRCP, data = NYweath_00a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.50 -5.88 -0.19 5.60 37.35
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -11.12661 2.34225 -4.75 2.0e-06 ***
## year 0.02551 0.00119 21.39 < 2e-16 ***
## month02 1.36397 0.20874 6.53 6.5e-11 ***
## month03 10.15636 0.20384 49.82 < 2e-16 ***
## month04 21.47723 0.20553 104.49 < 2e-16 ***
## month05 32.30716 0.20384 158.49 < 2e-16 ***
## month06 40.87389 0.20553 198.87 < 2e-16 ***
## month07 46.03686 0.20386 225.83 < 2e-16 ***
## month08 44.17913 0.20387 216.71 < 2e-16 ***
## month09 37.46350 0.20554 182.27 < 2e-16 ***
## month10 26.48077 0.20384 129.91 < 2e-16 ***
## month11 14.60290 0.20553 71.05 < 2e-16 ***
## month12 3.72867 0.20384 18.29 < 2e-16 ***
## PRCP -1.18037 0.11749 -10.05 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.86 on 44546 degrees of freedom
## Multiple R-squared: 0.773, Adjusted R-squared: 0.773
## F-statistic: 1.17e+04 on 13 and 44546 DF, p-value: <2e-16
#maxTfit00_all_intrxn <- lm(formula = TMAX ~ year + month*day + PRCP + SNOW, data = NYweath_00a)
#summary(maxTfit00_all)
#anova(maxTfit00_m, maxTfit00_ym)
#anova(maxTfit00_all, maxTfit00_all_intrxn)
#LM2: population replaces year as the regressor — population serves as a
# more direct proxy for human activity than calendar time.
maxTfit00_pop <- lm(formula = TMAX ~ Pop + month, data = NYweath_00a)
summary(maxTfit00_pop)
##
## Call:
## lm(formula = TMAX ~ Pop + month, data = NYweath_00a)
##
## Residuals:
## Min 1Q Median 3Q Max
## -39.23 -5.86 -0.17 5.65 37.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.51e+01 2.21e-01 158.7 < 2e-16 ***
## Pop 2.38e-07 1.11e-08 21.5 < 2e-16 ***
## month02 1.36e+00 2.09e-01 6.5 8.1e-11 ***
## month03 1.01e+01 2.04e-01 49.7 < 2e-16 ***
## month04 2.15e+01 2.06e-01 104.3 < 2e-16 ***
## month05 3.23e+01 2.04e-01 158.3 < 2e-16 ***
## month06 4.09e+01 2.06e-01 198.6 < 2e-16 ***
## month07 4.60e+01 2.04e-01 225.5 < 2e-16 ***
## month08 4.41e+01 2.04e-01 216.3 < 2e-16 ***
## month09 3.74e+01 2.06e-01 182.0 < 2e-16 ***
## month10 2.65e+01 2.04e-01 129.7 < 2e-16 ***
## month11 1.46e+01 2.06e-01 71.0 < 2e-16 ***
## month12 3.72e+00 2.04e-01 18.2 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 8.87 on 44547 degrees of freedom
## Multiple R-squared: 0.773, Adjusted R-squared: 0.773
## F-statistic: 1.26e+04 on 12 and 44547 DF, p-value: <2e-16
#maxTfit00_pop_all <- lm(formula = TMAX ~ Pop + month + PRCP, data = NYweath_00a)
#summary(maxTfit00_pop)
# Plot of New York State population over time (one point per daily weather
# row, so each year's single value is drawn many times).
ggplot(NYweath_00a, aes(x = year, y = Pop)) +
  geom_point() +
  # One labs() call replaces the original redundant labs()/xlab()/ylab()/
  # ggtitle() chain; the values kept are the ones that previously took
  # effect (the last call for each label).
  labs(x = "Year",
       y = "New York State Population",
       title = "Annual Population in New York State")
Let’s pull in our air quality data. It contains measurements of daily PM2.5 and air quality index values taken from various locations around Manhattan.
PM2.5 includes particles less than or equal to 2.5 micrometers and is also called fine particle pollution. The AQI is an index for reporting daily air quality. It tells how clean or polluted the air is, and what associated health effects might be a concern, especially for ground-level ozone and particle pollution.
Load the new data and have a look at its structure:
# Load the daily air-quality observations (PM2.5 and AQI) for Manhattan.
# read.csv() already returns a data.frame, so no extra wrapper is needed.
DailyAQ_00_22 <- read.csv("data/daily-AQ-NY-00-20.csv")
# Keep only the date, PM2.5 concentration, and AQI columns, renamed for brevity.
DailyAQ_00_22 <- DailyAQ_00_22[c("Date", "Daily.Mean.PM2.5.Concentration", "DAILY_AQI_VALUE")]
colnames(DailyAQ_00_22) <- c("DATE", "PM2.5", "AQI")
str(DailyAQ_00_22)
## 'data.frame': 7097 obs. of 3 variables:
## $ DATE : chr "1/1/00" "1/2/00" "1/3/00" "1/4/00" ...
## $ PM2.5: num 39.8 41.3 30.8 16.4 7.8 23.1 18.2 27 19.4 9.7 ...
## $ AQI : int 112 115 90 60 33 74 64 82 66 40 ...
xkablesummary(DailyAQ_00_22)
| DATE | PM2.5 | AQI | |
|---|---|---|---|
| Min | Length:7097 | Min. :-1.2 | Min. : 0 |
| Q1 | Class :character | 1st Qu.: 5.9 | 1st Qu.: 25 |
| Median | Mode :character | Median : 9.0 | Median : 38 |
| Mean | NA | Mean :10.9 | Mean : 41 |
| Q3 | NA | 3rd Qu.:13.6 | 3rd Qu.: 54 |
| Max | NA | Max. :86.0 | Max. :167 |
xkabledplyhead(DailyAQ_00_22)
| DATE | PM2.5 | AQI |
|---|---|---|
| 1/1/00 | 39.8 | 112 |
| 1/2/00 | 41.3 | 115 |
| 1/3/00 | 30.8 | 90 |
| 1/4/00 | 16.4 | 60 |
| 1/5/00 | 7.8 | 33 |
We need to convert the date from a character string to an R type. We also calculate year-over-year growth rates for both daily PM2.5 and AQI and store them in a column.
We have about 7,000 observations between the years 2000 and 2022. A few plots to help us visualize the data:
# Distribution plots of daily PM2.5 and daily AQI, each with a dashed
# vertical line (and text annotation) at the column mean.
# No na.rm on the means: the summaries above show no NA in either column —
# NOTE(review): add na.rm = TRUE if the input data could ever contain NA.
mean_pm25 <- mean(DailyAQ_00_22$PM2.5)
mean_aqi <- mean(DailyAQ_00_22$AQI)
# TODO: combine plots into one frame
ggplot(DailyAQ_00_22) +
geom_histogram(aes(x=PM2.5), na.rm=TRUE, alpha=0.5, color="black", fill='#BD2AE2', bins=100, binwidth=2) +
geom_vline(xintercept=mean_pm25, color="black", size=1, linetype=5, show.legend=FALSE) +
annotate("text", x=mean_pm25 + 9, y=1000, label=paste(round(mean_pm25, 2)), angle=0, size=4, color="black") +
labs(title="Distribution of Daily PM2.5 Measurements", x="ug/m3 LC", y="Count")
ggplot(DailyAQ_00_22) +
geom_histogram(aes(x=AQI), na.rm=TRUE, alpha=0.5, color="black", fill='#2DD164', bins=50, binwidth=5) +
geom_vline(xintercept=mean_aqi, color="black", size=1, linetype=5, show.legend=FALSE) +
annotate("text", x=mean_aqi + 9, y=625, label=paste(round(mean_aqi, 2)), angle=0, size=4, color="black") +
labs(title="Distribution of Daily AQI Level", x="", y="Count")
# Year-over-year change-rate plots for PM2.5 and AQI.
# NOTE(review): DailyAQ_00_22_Yearly_Growth is not created anywhere in the
# visible code; presumably it is built in an unechoed chunk (yearly means
# and year-over-year % change of PM2.5/AQI) — confirm against the Rmd source.
# TODO: group these in same figure, separate plots
ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
geom_line(aes(x = year, y = pm2.5_diffRate), na.rm = T, stat = "identity", color="#290DDA", size=1) +
geom_point(aes(x = year, y = pm2.5_diffRate), na.rm = TRUE, fill="#124CF2", shape = 23) +
labs(title="PM2.5 particulate year-over-year rate in NYC", x="Year", y="ug/m3 LC") +
theme(
axis.title.y = element_text(color = "#043008", size = 13),
axis.title.y.right = element_text(color = "#E6E930", size = 13)
)
ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
geom_line(aes(x = year, y = aqi_diffRate), na.rm = T, stat="identity", color="#043008", size=1) +
geom_point(aes(x = year, y = aqi_diffRate), na.rm = TRUE, fill="#E6E930", shape = 23) +
labs(title="AQI year-over-year rate in NYC", x="Year", y="AQI") +
theme(
axis.title.y = element_text(color = "#043008", size = 13),
axis.title.y.right = element_text(color = "#E6E930", size = 13)
)
# ggplot(DailyAQ_00_22_Yearly_Growth, aes(group=1)) +
# labs(title="AQI and growth rate and growht/decay rate by year in NYC", x="Year", y="ug/m3 LC") +
# scale_y_continuous(sec.axis = sec_axis(~., name="Year-over-year Diff (%)")) +
# theme(
# axis.title.y = element_text(color = "#2DD164", size = 13),
# axis.title.y.right = element_text(color = "#E6E930", size = 13)
# )
Next, we combine our new dataset with the NYC weather data based on the date. The days without a matching air quality measurement will be dropped after merge.
# Merge air-quality and weather observations on DATE. merge()'s default
# all = FALSE keeps only dates present in both data sets, so days without
# an air-quality measurement are dropped.
DailyAQ_merged <- merge(DailyAQ_00_22, NYweath_00, by="DATE")
# Keep only the columns used downstream. year.x is the air-quality side's
# year column, suffixed by merge() because both inputs carry "year".
DailyAQ_merged <- DailyAQ_merged[ , c('DATE', 'year.x', 'month', 'PM2.5', 'AQI', 'TMAX', 'TMIN', 'PRCP', 'SNOW')]
# Rename by name rather than by position, so the code does not silently
# relabel the wrong column if the selection above ever changes order.
colnames(DailyAQ_merged)[colnames(DailyAQ_merged) == "year.x"] <- "year"
str(DailyAQ_merged)
## 'data.frame': 7063 obs. of 9 variables:
## $ DATE : Date, format: "2000-01-01" "2000-01-02" ...
## $ year : num 2000 2000 2000 2000 2000 2000 2000 2000 2000 2000 ...
## $ month: Factor w/ 12 levels "01","02","03",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ PM2.5: num 39.8 41.3 30.8 16.4 7.8 23.1 18.2 27 19.4 9.7 ...
## $ AQI : int 112 115 90 60 33 74 64 82 66 40 ...
## $ TMAX : num 50 60 64 60 47 49 38 51 58 52 ...
## $ TMIN : num 34 43 51 46 29 35 30 37 44 40 ...
## $ PRCP : num 0 0 0 0.7 0 0 0 0.02 0.84 0 ...
## $ SNOW : num 0 0 0 0 0 0 0 0 0 0 ...
xkablesummary(DailyAQ_merged)
| DATE | year | month | PM2.5 | AQI | TMAX | TMIN | PRCP | SNOW | |
|---|---|---|---|---|---|---|---|---|---|
| Min | Min. :2000-01-01 | Min. :2000 | 03 : 634 | Min. :-1.2 | Min. : 0.0 | Min. : 13.0 | Min. :-1.0 | Min. :0.00 | Min. : 0.00 |
| Q1 | 1st Qu.:2007-02-06 | 1st Qu.:2007 | 01 : 617 | 1st Qu.: 5.9 | 1st Qu.: 25.0 | 1st Qu.: 48.0 | 1st Qu.:36.0 | 1st Qu.:0.00 | 1st Qu.: 0.00 |
| Median | Median :2012-03-16 | Median :2012 | 05 : 615 | Median : 9.1 | Median : 38.0 | Median : 64.0 | Median :48.0 | Median :0.00 | Median : 0.00 |
| Mean | Mean :2011-12-20 | Mean :2011 | 04 : 611 | Mean :10.9 | Mean : 41.1 | Mean : 62.6 | Mean :48.4 | Mean :0.13 | Mean : 0.09 |
| Q3 | 3rd Qu.:2017-05-27 | 3rd Qu.:2017 | 12 : 610 | 3rd Qu.:13.6 | 3rd Qu.: 54.0 | 3rd Qu.: 79.0 | 3rd Qu.:63.0 | 3rd Qu.:0.05 | 3rd Qu.: 0.00 |
| Max | Max. :2022-09-26 | Max. :2022 | 07 : 579 | Max. :86.0 | Max. :167.0 | Max. :103.0 | Max. :83.0 | Max. :7.57 | Max. :27.30 |
| NA | NA | NA | (Other):3397 | NA | NA | NA | NA | NA | NA |
# Subset to the numerical variables used for correlation and modeling.
DailyAQ_numerics <- DailyAQ_merged[ , c('PM2.5', 'AQI', 'TMAX', 'TMIN', 'PRCP', 'SNOW', 'year')]
# combine PRCP and SNOW into single value
#DailyAQ_numerics$PRCP <- DailyAQ_numerics$PRCP + DailyAQ_numerics$SNOW
#DailyAQ_numerics <- subset(DailyAQ_numerics, select = -c(SNOW))
# Rebase year so 2000 maps to 0; model intercepts then refer to year 2000.
DailyAQ_numerics$year <- DailyAQ_numerics$year - 2000
Lattice pairs plot
# Base-graphics scatterplot matrix of all numeric variables.
pairs(DailyAQ_numerics)
# psych::pairs.panels augments the matrix with histograms, Pearson
# correlations, density curves, correlation ellipses, and linear fits.
pairs.panels(DailyAQ_numerics,
method = "pearson", # correlation method
hist.col = "red", # set histogram color
density = TRUE, # show density plots
ellipses = TRUE, # show correlation ellipses
smoother = TRUE,
lm = TRUE,
main = "Pairs Plot Of Weather and Air Quality Numerical Variables",
cex.labels=0.75
)
Another way to look at correlation using the corrplot
function:
# Pearson correlation matrix of the merged numeric variables.
# NOTE(review): no use= argument, so any NA would propagate through the
# matrix; the summary above shows no missing values in these columns.
DailyAQ_cor <- cor(DailyAQ_numerics)
corrplot(DailyAQ_cor, method="number", title="Correlation Plot Of Weather and Air Quality Numerical Variables", mar = c(2, 2, 2, 2))
From the Pearson correlation plot above, we can see a large, positive correlation between PM2.5 concentrations and the daily AQI values. This is expected, as PM2.5 is heavily weighted in calculations of AQI. Unfortunately, the correlations between our weather variables and the air quality variables are relatively weak. However, we will still attempt a linear model between them below.
# Yearly average plots of daily AQI and PM2.5.
# NOTE(review): DailyAQ_00_22_Yearly_Growth is not created anywhere in the
# visible code; presumably built in an unechoed chunk — confirm.
ggplot(DailyAQ_00_22_Yearly_Growth) +
geom_line(aes(x = year, y = aqi_avg), stat="identity", color="#2DD164", size=1) +
geom_point(aes(x = year, y = aqi_avg), na.rm = TRUE, fill="#457108", shape = 21) +
labs(title="Average AQI by year in NYC", x="Year", y="AQI value")
ggplot(DailyAQ_00_22_Yearly_Growth) +
geom_line(aes(x = year, y = pm2.5_avg), stat="identity", color="#BD2AE2", size=1) +
geom_point(aes(x = year, y = pm2.5_avg), na.rm = TRUE, fill="#124CF2", shape = 21) +
labs(title="Average PM2.5 particulate amount by year in NYC", x="Year", y="Year-over-year Diff (%)")
Let’s start by creating a linear model to describe the relationship between AQI and year.
# Simple linear model: daily AQI as a function of year (rebased to 2000).
aqi_fit <- lm(AQI ~ year, data = DailyAQ_numerics)
summary(aqi_fit)
##
## Call:
## lm(formula = AQI ~ year, data = DailyAQ_numerics)
##
## Residuals:
## Min 1Q Median 3Q Max
## -50.34 -13.11 -1.49 11.06 126.83
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 61.4369 0.4497 136.6 <2e-16 ***
## year -1.7748 0.0342 -51.9 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 18.4 on 7061 degrees of freedom
## Multiple R-squared: 0.276, Adjusted R-squared: 0.276
## F-statistic: 2.69e+03 on 1 and 7061 DF, p-value: <2e-16
xkabledply(aqi_fit, title = paste("First Linear Model: ", format( formula(aqi_fit) )))
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 61.44 | 0.4497 | 136.6 | 0 |
| year | -1.77 | 0.0342 | -51.8 | 0 |
The coefficient for the year regressor is significant and negative: daily AQI decreases by about 1.77 points per year (on the rebased year scale). Although the p-value of the F-statistic is significant, year still only explains about 28% of the variability in daily AQI measurements.
# Daily AQI vs year with an OLS trend line; jitter reduces overplotting of
# the integer-valued AQI observations.
ggplot(DailyAQ_00_22, aes(x = year, y = AQI)) +
geom_point(alpha = 0.5, color = "#2DD164", position = "jitter") +
labs(x = "Year", y = "AQI Value", title = "Daily AQI Values From 2000-2022 With Trend Line") +
geom_smooth(method = 'lm', formula = 'y ~ x', color = "black", fill="black")
The plot displays a slightly downward trend in daily AQI, but there is a lot of noise distorting the fit.
Adding month as a categorical regressor
In our first analysis, we analyzed linear trends of TMAX over time and observed a slight positive correlation over the years 1900-2022. Based on that fit, we hypothesized that seasonality trends had an impact on model performance.
We believe seasonality also affects daily AQI measurements.
# NYC weather: average TMAX (and TMIN) by calendar month across 1900-2022.
# TRUE spelled out rather than the reassignable shorthand T.
NYweath_Monthly_Avg <- NYweath_00 %>%
  group_by(month) %>%
  summarize(avg_max_temp = mean(TMAX, na.rm = TRUE),
            avg_min_temp = mean(TMIN, na.rm = TRUE))
# Line plot of the monthly TMAX averages, one tick per month.
ggplot(NYweath_Monthly_Avg, aes(x = as.numeric(month), y = avg_max_temp)) +
  geom_line(color = "#F21E1E", size = 2) +
  geom_point(na.rm = TRUE, fill = "#126BF4", shape = 21, size = 4) +
  labs(title = "Average TMAX By Month in NYC", x = "Month", y = "Temperature (°F)") +
  scale_x_continuous(name = "Month",
                     breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))
# Monthly averages of PM2.5 and AQI across all years.
DailyAQ_monthly <- DailyAQ_merged %>%
  group_by(month) %>%
  summarize(pm2.5_avg = mean(PM2.5, na.rm = TRUE),
            aqi_avg = mean(AQI, na.rm = TRUE))
# Month-over-month growth/decay rates, as a percentage of the current
# month's average.
DailyAQ_monthly <- DailyAQ_monthly %>%
  mutate(pm2.5_diffRate = ((pm2.5_avg - lag(pm2.5_avg)) / pm2.5_avg) * 100,
         aqi_diffRate = ((aqi_avg - lag(aqi_avg)) / aqi_avg) * 100
  )
# lag() leaves January (row 1) NA; fill it using December (row 12) as the
# previous month. Direct column/element indexing replaces the original's
# awkward `DailyAQ_monthly[1, 4]$...` form, which depended on column order.
DailyAQ_monthly$pm2.5_diffRate[1] <- ((DailyAQ_monthly$pm2.5_avg[1] - DailyAQ_monthly$pm2.5_avg[12]) / DailyAQ_monthly$pm2.5_avg[1]) * 100
DailyAQ_monthly$aqi_diffRate[1] <- ((DailyAQ_monthly$aqi_avg[1] - DailyAQ_monthly$aqi_avg[12]) / DailyAQ_monthly$aqi_avg[1]) * 100
# Monthly average AQI and month-over-month change-rate plots.
# (The earlier comment said "yearly"; these are by calendar month.)
# TODO: combine with month-over-month change plot
ggplot(DailyAQ_monthly, aes(x = as.numeric(month), y = aqi_avg)) +
geom_line(color="#47ABE9", size = 2) +
geom_point(na.rm = TRUE, fill="#C10808", shape = 21, size = 4) +
labs(title="Average AQI By Month in NYC", x="Month", y="AQI") +
scale_x_continuous(name = "Month",
breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))
ggplot(DailyAQ_monthly, aes(x = as.numeric(month), y = aqi_diffRate)) +
geom_line(na.rm = TRUE, stat="identity", color="#043008", size=2) +
geom_point(na.rm = TRUE, fill="#E6E930", shape = 21) +
labs(title="Average AQI month-over-month change rate", x="Month", y="AQI") +
scale_x_continuous(name = "Month",
breaks = c(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12))
Let’s modify our last model to attempt to fit seasonality by adding
month as a categorical regressor and our
variable-of-interest from the last project - TMAX - to
predict daily AQI.
# Second model: daily AQI on TMAX plus month as a categorical regressor.
aqi_fit2 <- lm(AQI ~ TMAX + month, data = DailyAQ_merged)
summary(aqi_fit2)
##
## Call:
## lm(formula = AQI ~ TMAX + month, data = DailyAQ_merged)
##
## Residuals:
## Min 1Q Median 3Q Max
## -43.72 -14.66 -3.03 11.52 124.66
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.9722 1.3444 18.58 < 2e-16 ***
## TMAX 0.6781 0.0271 25.03 < 2e-16 ***
## month02 -5.9768 1.1604 -5.15 2.7e-07 ***
## month03 -19.3975 1.1672 -16.62 < 2e-16 ***
## month04 -33.0288 1.2871 -25.66 < 2e-16 ***
## month05 -37.9619 1.4304 -26.54 < 2e-16 ***
## month06 -38.8086 1.5925 -24.37 < 2e-16 ***
## month07 -37.7940 1.6857 -22.42 < 2e-16 ***
## month08 -40.4565 1.6563 -24.43 < 2e-16 ***
## month09 -43.7962 1.5422 -28.40 < 2e-16 ***
## month10 -34.8408 1.3458 -25.89 < 2e-16 ***
## month11 -19.9973 1.2209 -16.38 < 2e-16 ***
## month12 -7.9325 1.1470 -6.92 5.1e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20 on 7050 degrees of freedom
## Multiple R-squared: 0.149, Adjusted R-squared: 0.147
## F-statistic: 103 on 12 and 7050 DF, p-value: <2e-16
xkabledply(aqi_fit2, title = paste("Second Linear Model: ", format( formula(aqi_fit2) )))
| Estimate | Std. Error | t value | Pr(>|t|) | |
|---|---|---|---|---|
| (Intercept) | 24.972 | 1.3444 | 18.58 | 0 |
| TMAX | 0.678 | 0.0271 | 25.03 | 0 |
| month02 | -5.977 | 1.1604 | -5.15 | 0 |
| month03 | -19.398 | 1.1672 | -16.62 | 0 |
| month04 | -33.029 | 1.2871 | -25.66 | 0 |
| month05 | -37.962 | 1.4304 | -26.54 | 0 |
| month06 | -38.809 | 1.5925 | -24.37 | 0 |
| month07 | -37.794 | 1.6857 | -22.42 | 0 |
| month08 | -40.456 | 1.6563 | -24.43 | 0 |
| month09 | -43.796 | 1.5422 | -28.40 | 0 |
| month10 | -34.841 | 1.3458 | -25.89 | 0 |
| month11 | -19.997 | 1.2209 | -16.38 | 0 |
| month12 | -7.933 | 1.1470 | -6.92 | 0 |
The regression coefficient for TMAX is significant and positively
correlated, with each degree increase resulting in AQI increasing by a
factor of 0.68. All months, when compared to January, have a negative
impact on AQI, with September having the largest difference. The p-value
of the model’s F-statistic is also significant, allowing us to reject
the null hypothesis and conclude that there’s a significant relationship
between our chosen predictors and the daily AQI value. However, the
\(R^2\) for our model is only
.149, which indicates that only 14.9% of the variation in
daily AQI can be explained by TMAX and month.
Seasonality can cause a poor linear model. Properly testing it would require developing a seasonality time-series model to properly fit the data.
Check for multicollinearity
# Variance inflation factors for the second model's regressors; values
# well below the common cutoff of 5-10 indicate acceptable collinearity.
xkablevif(aqi_fit2, title = "Model 2 VIF")
| month02 | month03 | month04 | month05 | month06 | month07 | month08 | month09 | month10 | month11 | month12 | TMAX |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1.78 | 1.98 | 2.32 | 2.88 | 3.36 | 3.79 | 3.59 | 3.01 | 2.38 | 1.96 | 1.84 | 4.25 |
The VIF values of all regressors are acceptable.
A k-NN model can help us further analyze the seasonality effect by
attempting to predict the month based on AQI and
TMAX variables.
Let’s start with plotting variables and discerning month:
# Scatter of TMAX vs AQI colored by month, to eyeball how well these two
# variables separate the months (motivates the k-NN attempt below).
ggplot(DailyAQ_merged) +
geom_point(aes(x=AQI, y=TMAX, color=month), alpha = 0.7) +
labs(title = "Daily Maximum Temperature vs Daily Air Quality Index Value Distinguished By Month",
x = "Daily AQI Value",
y = "Daily Maximum Temperature (F)") +
scale_color_brewer(palette = "Paired")
Center and scale our data
# Standardize the numeric columns (positions 4:9 = PM2.5, AQI, TMAX, TMIN,
# PRCP, SNOW) to mean 0 / sd 1, as required for distance-based k-NN.
DailyAQ_scaled <- as.data.frame(scale(DailyAQ_merged[4:9], center = TRUE, scale = TRUE))
str(DailyAQ_scaled)
## 'data.frame': 7063 obs. of 6 variables:
## $ PM2.5: num 3.839 4.039 2.642 0.727 -0.417 ...
## $ AQI : num 3.282 3.421 2.264 0.876 -0.373 ...
## $ TMAX : num -0.6996 -0.1462 0.0752 -0.1462 -0.8656 ...
## $ TMIN : num -0.872 -0.328 0.155 -0.147 -1.173 ...
## $ PRCP : num -0.356 -0.356 -0.356 1.502 -0.356 ...
## $ SNOW : num -0.113 -0.113 -0.113 -0.113 -0.113 ...
Create train and test data sets with 4:1 splits, as well as label sets.
# Fix the RNG seed so the 80/20 split is reproducible.
set.seed(1000)
# Randomly assign each row to group 1 (train, p = 0.80) or 2 (test, p = 0.20).
DailyAQ_sample <- sample(2, nrow(DailyAQ_scaled), replace=TRUE, prob=c(0.80, 0.20))
DailyAQ_training <- DailyAQ_scaled[DailyAQ_sample == 1, ]
DailyAQ_test <- DailyAQ_scaled[DailyAQ_sample == 2, ]
# Column 3 of DailyAQ_merged holds the class labels (month, per the 01-12
# classes in the confusion matrix below).
DailyAQ_trainLabels <- DailyAQ_merged[DailyAQ_sample == 1, 3]
DailyAQ_testLabels <- DailyAQ_merged[DailyAQ_sample == 2, 3]
nrow(DailyAQ_training)
## [1] 5664
nrow(DailyAQ_test)
## [1] 1399
Build kNN model
# set kval -- number of nearest neighbors to consider
kval <- 3
# build model: predict the month of each test row from its kval nearest
# training rows in the scaled feature space (knn from FNN, loaded above)
DailyAQ_pred <- knn(train = DailyAQ_training,
test = DailyAQ_test,
cl = DailyAQ_trainLabels,
k = kval)
# confusion matrix of predicted vs actual month; overall accuracy extracted
# for reporting later in the text
DailyAQ_confusionMatrix <- caret::confusionMatrix(DailyAQ_pred, reference = DailyAQ_testLabels)
DailyAQ_pred_accuracy <- DailyAQ_confusionMatrix$overall['Accuracy']
xkabledply(as.matrix(DailyAQ_confusionMatrix), title = paste("ConfusionMatrix for k = ", kval))
| 01 | 02 | 03 | 04 | 05 | 06 | 07 | 08 | 09 | 10 | 11 | 12 | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 01 | 60 | 38 | 16 | 7 | 1 | 0 | 0 | 0 | 0 | 5 | 13 | 50 |
| 02 | 24 | 33 | 38 | 10 | 0 | 0 | 0 | 0 | 0 | 4 | 18 | 18 |
| 03 | 14 | 13 | 37 | 26 | 7 | 1 | 0 | 0 | 2 | 11 | 27 | 11 |
| 04 | 2 | 3 | 20 | 52 | 22 | 6 | 2 | 2 | 3 | 36 | 25 | 6 |
| 05 | 0 | 1 | 2 | 20 | 38 | 19 | 10 | 8 | 28 | 27 | 7 | 0 |
| 06 | 0 | 0 | 0 | 1 | 8 | 39 | 34 | 25 | 25 | 7 | 1 | 0 |
| 07 | 0 | 0 | 0 | 0 | 4 | 15 | 41 | 34 | 13 | 3 | 0 | 0 |
| 08 | 0 | 0 | 0 | 2 | 2 | 15 | 23 | 24 | 13 | 3 | 0 | 0 |
| 09 | 0 | 0 | 1 | 1 | 9 | 14 | 9 | 10 | 17 | 4 | 1 | 0 |
| 10 | 0 | 3 | 2 | 6 | 8 | 5 | 1 | 0 | 3 | 24 | 8 | 1 |
| 11 | 2 | 1 | 9 | 6 | 4 | 1 | 0 | 0 | 0 | 6 | 15 | 6 |
| 12 | 16 | 13 | 9 | 3 | 0 | 0 | 0 | 0 | 0 | 2 | 5 | 19 |
xkabledply(data.frame(DailyAQ_confusionMatrix$byClass), title=paste("k = ", kval))
| Sensitivity | Specificity | Pos.Pred.Value | Neg.Pred.Value | Precision | Recall | F1 | Prevalence | Detection.Rate | Detection.Prevalence | Balanced.Accuracy | |
|---|---|---|---|---|---|---|---|---|---|---|---|
| Class: 01 | 0.508 | 0.898 | 0.316 | 0.952 | 0.316 | 0.508 | 0.390 | 0.0843 | 0.0429 | 0.1358 | 0.704 |
| Class: 02 | 0.314 | 0.913 | 0.228 | 0.943 | 0.228 | 0.314 | 0.264 | 0.0751 | 0.0236 | 0.1036 | 0.614 |
| Class: 03 | 0.276 | 0.911 | 0.248 | 0.922 | 0.248 | 0.276 | 0.262 | 0.0958 | 0.0264 | 0.1065 | 0.594 |
| Class: 04 | 0.388 | 0.900 | 0.290 | 0.933 | 0.290 | 0.388 | 0.332 | 0.0958 | 0.0372 | 0.1279 | 0.644 |
| Class: 05 | 0.369 | 0.906 | 0.237 | 0.948 | 0.237 | 0.369 | 0.289 | 0.0736 | 0.0272 | 0.1144 | 0.637 |
| Class: 06 | 0.339 | 0.921 | 0.279 | 0.940 | 0.279 | 0.339 | 0.306 | 0.0822 | 0.0279 | 0.1001 | 0.630 |
| Class: 07 | 0.342 | 0.946 | 0.373 | 0.939 | 0.373 | 0.342 | 0.356 | 0.0858 | 0.0293 | 0.0786 | 0.644 |
| Class: 08 | 0.233 | 0.955 | 0.293 | 0.940 | 0.293 | 0.233 | 0.260 | 0.0736 | 0.0172 | 0.0586 | 0.594 |
| Class: 09 | 0.164 | 0.962 | 0.258 | 0.935 | 0.258 | 0.164 | 0.200 | 0.0743 | 0.0122 | 0.0472 | 0.563 |
| Class: 10 | 0.182 | 0.971 | 0.393 | 0.919 | 0.393 | 0.182 | 0.249 | 0.0944 | 0.0172 | 0.0436 | 0.576 |
| Class: 11 | 0.125 | 0.973 | 0.300 | 0.922 | 0.300 | 0.125 | 0.176 | 0.0858 | 0.0107 | 0.0357 | 0.549 |
| Class: 12 | 0.171 | 0.963 | 0.284 | 0.931 | 0.284 | 0.171 | 0.213 | 0.0793 | 0.0136 | 0.0479 | 0.567 |
How does k affect classification accuracy?
# Fit a kNN classifier with k neighbors and return its validation accuracy.
#
# k           - number of neighbors to consider
# train_set   - scaled predictors for the training rows
# val_set     - scaled predictors for the validation rows
# train_class - class labels for the training rows
# val_class   - true class labels for the validation rows
#
# Returns a 1x2 matrix with named columns `k` and `accuracy`.
evaluateK <- function(k, train_set, val_set, train_class, val_class) {
  # Fix the seed so any random tie-breaking in knn() is reproducible
  # and comparable across different values of k.
  set.seed(1000)
  class_knn <- knn(train = train_set, #<- training set cases
                   test = val_set,    #<- test set cases
                   cl = train_class,  #<- category for classification
                   k = k)             #<- number of neighbors considered
  # Cross-tabulate predictions against truth; the table is square with
  # matching row/column order, so the diagonal holds the correct calls.
  tab <- table(class_knn, val_class)
  accu <- sum(diag(tab)) / sum(tab)
  cbind(k = k, accuracy = accu)
}
# call evaluateK function for each odd k-value between 1 to 21
# Evaluate the model at each odd k from 1 to 21 and collect the (k, accuracy)
# pairs. Row-binding the 1x2 matrices returned by evaluateK and converting
# once avoids sapply's type-unstable simplification and the two-step reformat.
knn_different_k <- do.call(rbind, lapply(seq(1, 21, by = 2),
                                         function(x) evaluateK(x,
                                                               train_set = DailyAQ_training,
                                                               val_set = DailyAQ_test,
                                                               train_class = DailyAQ_trainLabels,
                                                               val_class = DailyAQ_testLabels)))
# Final shape: data frame with columns `k` and `accuracy`, one row per k.
knn_different_k <- as.data.frame(knn_different_k)
# Plot validation accuracy as a function of k to choose a neighborhood size.
# Layer order (line first, points on top) is kept so points overplot the line.
ggplot(knn_different_k, aes(x = k, y = accuracy)) +
  geom_line(size = 1.5, color = "orange") +
  geom_point(size = 3) +
  labs(x = "Model k-value",
       y = "Accuracy",
       title = "kNN Model Accuracy vs k-value")
A value of k = 13 appears to be a reasonable choice, as it captures the
largest gains in predictive accuracy before the incremental improvement
trails off. Even so, with an accuracy of only 0.32, our model predicting
month from TMAX and AQI is not a strong
fit.